from utils import Activities, Users
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ptitprince as pt
from utils import join_by_fuzzy
import info_utilities
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from pprint import pprint
import pydot
from IPython.display import Image
import warnings
#warnings.filterwarnings("ignore")
# Disable pandas' truncation so printed DataFrames show every column and row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Location of the input files, relative to the current working directory.
data_folder = "../data/"

# Plotly trace styling bundle: smoothed spline lines with large,
# slightly transparent markers.
line_traces = {
    'mode': 'lines+markers',
    'line_shape': 'spline',
    'line_smoothing': 1,
    'marker_size': 10,
    'marker_opacity': 0.9,
}
# Read the tab-separated student table from the data folder.
students = pd.read_csv(f"{data_folder}students.csv", sep="\t")

# Rows with a missing E_grade cannot be labelled later on: report how many
# there are, then remove them from `students` in place.
students_wo_grades = students[students['E_grade'].isnull()]
with_null = students_wo_grades.shape[0]
print(f'There are {with_null}/{len(students)} students without estimate grade ({round(with_null/len(students)*100,2)}%)')
students.drop(index=students_wo_grades.index, inplace=True)
# Lower and upper quartile of E_grade delimit the three result groups.
q_25 = students['E_grade'].quantile(0.25)
q_75 = students['E_grade'].quantile(0.75)

# Label every student in one assignment:
#   E_grade <  q_25          -> 'Failing'
#   E_grade >= q_75          -> 'Good'
#   everything in between    -> 'Average'
# The previous two-step version finished with
# `students['E_result'].where(..., inplace=True)`, i.e. an in-place update of
# a column selection. That is a chained assignment: pandas 2.x warns about it
# and with Copy-on-Write enabled it leaves the DataFrame untouched, so no row
# would ever be labelled 'Good'.
students['E_result'] = np.select(
    [students['E_grade'] < q_25, students['E_grade'] >= q_75],
    ['Failing', 'Good'],
    default='Average',
)

# Marker colour per result group.
grade_map = {"Good": "green", "Failing": "red", "Average": "orange"}
# Numeric rank per result group (1 = Failing, 2 = Average, 3 = Good).
order_map = {"Good": 3, "Failing": 1, "Average": 2}

# Store the rank as a float column and sort the table by it in place, so the
# groups appear in Failing -> Average -> Good order.
students['grade_order'] = students["E_result"].map(order_map).astype("float64")
students.sort_values("grade_order", inplace=True)

# Summary statistics of E_grade; the result is not stored or printed, so it
# is only rendered when evaluated as the last expression of an interactive cell.
students['E_grade'].describe()
# Row-wise mean of the three per-school-year activity counts.
_school_year_cols = [
    'n_activities_school_year_1',
    'n_activities_school_year_2',
    'n_activities_school_year_3',
]
students['E_activities_per_school_year'] = students.loc[:, _school_year_cols].mean(axis="columns")

# Row-wise mean of the five per-semester in-curriculum counts.
_semester_cols = [
    'n_in_curriculum_semester1',
    'n_in_curriculum_semester2',
    'n_in_curriculum_semester3',
    'n_in_curriculum_semester4',
    'n_in_curriculum_semester5',
]
students['E_in_curriculum_per_semester'] = students.loc[:, _semester_cols].mean(axis="columns")
# Columns that must not be treated as features.
exclude_cols = [
    # Presumably identifiers, role flags and other per-user metadata
    # (inferred from the column names only -- verify against the CSV schema).
    'us_user', 'user_name', 'user_email', 'start_year', 'start_semester',
    'archived', 'user_type', 'contract_type', 'HGF', 'convocatore', 'teacher',
    'supervisor', 'ispettore', 'statista', 'student', 'classes', 'companies',
    'avg_specific_evaluations', 'avg_supervisor_evaluation',
    'n_received_feedback_requests', 'n_feedback_responses',
    # Grade columns, including E_grade and the E_result label derived from it.
    'grade_1st', 'grade_2nd', 'grade_3rd', 'E_grade', 'E_result',
    'final_CP', 'final_LP', 'final_IP', 'final_grade',
    # Raw per-school-year counts; their mean E_activities_per_school_year is kept.
    'n_activities_school_year_1', 'n_activities_school_year_2',
    'n_activities_school_year_3',
    # Raw per-semester counts; their mean E_in_curriculum_per_semester is kept.
    'n_in_curriculum_semester1', 'n_in_curriculum_semester2',
    'n_in_curriculum_semester3', 'n_in_curriculum_semester4',
    'n_in_curriculum_semester5',
]

# Keep every remaining column, in the DataFrame's own column order.
# A plain `list(set(...) - set(...))` yields an order that depends on string
# hashing, which is randomised per interpreter run, so the plots built from
# `features` would be reshuffled on every execution.
# NOTE(review): 'grade_order' is not excluded although it is derived from
# E_result -- confirm that it is meant to be plotted as a feature.
_excluded = set(exclude_cols)
features = [column for column in students.columns if column not in _excluded]
# One box per selected feature; pandas delegates the drawing to the plotting
# backend configured at the top of the file, and the resulting figure is
# displayed right away.
fig = students.loc[:, features].boxplot(height=800)
fig.show()
# Grid of plots with one row per feature:
#   left  column -> rain-cloud plot of the feature for each E_result group
#   right column -> scatter of the feature against E_grade, coloured by group
rain_columns = features
n_row = len(rain_columns)

# squeeze=False keeps `axes` two-dimensional even when there is only one
# feature; without it plt.subplots returns a 1-D array for a single row and
# the axes[i, 0] / axes[i, 1] lookups below raise an IndexError.
# NOTE(review): the figure grows by 10 inches per feature at 300 dpi, so a
# long feature list produces a very large image -- confirm this is acceptable.
f, axes = plt.subplots(n_row, 2, figsize=(18, 10*n_row), dpi=300, squeeze=False)

for i, col_name in enumerate(rain_columns):
    # Left panel: horizontal rain-cloud of the feature, split by result group.
    ax = pt.RainCloud(x='E_result', y=col_name,
                      data=students, orient='h',
                      move=.0, alpha=.65, ax=axes[i, 0], palette="Set1")
    # Right panel: feature value versus E_grade, one colour per result group.
    ax = sns.scatterplot(x="E_grade", y=col_name, hue="E_result",
                         data=students, ax=axes[i, 1], palette="Set1")
def _regression_figure(frame, predictor, target='E_grade', marker_colors=None):
    """Fit ``target ~ predictor`` and build a figure of data plus fitted line.

    Args:
        frame: DataFrame holding both the ``predictor`` and ``target`` columns.
        predictor: Name of the single explanatory column.
        target: Name of the response column.
        marker_colors: Optional per-row colours for the raw-data markers.

    Returns:
        A ``(model, figure)`` tuple: the fitted ``LinearRegression`` and a
        plotly ``Figure`` with a 'raw' marker trace and a 'predictions' line.
    """
    # scikit-learn expects a 2-D design matrix, hence the single-column reshape.
    design = frame[predictor].values.reshape(-1, 1)
    model = LinearRegression().fit(design, frame[target])

    raw_style = dict(mode='markers', name='raw')
    if marker_colors is not None:
        raw_style['marker_color'] = marker_colors

    figure = go.Figure()
    figure.add_trace(go.Scatter(x=frame[predictor], y=frame[target], **raw_style))
    figure.add_trace(go.Scatter(x=frame[predictor], y=model.predict(design),
                                mode='lines',
                                name='predictions'))
    # Label the axes so the figures of the different predictors can be told apart.
    figure.update_layout(xaxis_title=predictor, yaxis_title=target)
    return model, figure


# Simple linear regression of E_grade on each candidate predictor.
# This replaces four copy-pasted stanzas which each built (and discarded) a
# px.scatter figure and never displayed the go.Figure they assembled; only the
# first stanza coloured its markers by result group. Every predictor now gets
# the same treatment: print the slope, then show the coloured figure.
# `corr_var`, `reg` and `fig` remain module-level names and, as before, hold
# the values for 'n_folders' once the loop has finished.
for corr_var in ('n_feedback_requests', 'n_in_curriculum', 'n_activities', 'n_folders'):
    reg, fig = _regression_figure(
        students,
        corr_var,
        marker_colors=students['E_result'].map(grade_map),
    )
    print(reg.coef_)
    fig.show()